# TF-IDF example

## Function to calculate the term frequency (relative frequency, or TF)

In [None]:
# Toy corpus: three short sentences that share most of their vocabulary
docs = [
    "this is a sample document",
    "this document is another example",
    "this example document is different",
]

# Calculate tf
def compute_tf(doc):
    """Return the term frequency of every word in ``doc``.

    The document is split on whitespace. Each distinct word maps to its
    number of occurrences divided by the total number of words in the
    document. A document without any words yields an empty dict.
    """
    tokens = doc.split()
    n_tokens = len(tokens)

    # Tally raw occurrences, keeping words in order of first appearance.
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    # Normalise each raw count by the document length.
    return {token: count / n_tokens for token, count in counts.items()}

# Show the term-frequency dict of each example document, one per line
for doc in docs:
    print(compute_tf(doc))

## Function to calculate the inverse document frequency (IDF)

IDF is a measure of how informative a word is across a collection (or corpus) of documents.
The intuition behind IDF is that words that appear in many documents are less informative than those that appear in fewer documents.
For a corpus of $N$ documents, of which $n_t$ contain the word $t$, it is computed as $\mathrm{idf}(t) = \log(N / n_t)$.


In [None]:
# Toy corpus: three short sentences that share most of their vocabulary
docs = [
    "this is a sample document",
    "this document is another example",
    "this example document is different",
]


import math

# Calculate idf
def compute_idf(docs):
    """Return the inverse document frequency of every word in ``docs``.

    Each document is split on whitespace. A word's IDF is the natural
    logarithm of (number of documents / number of documents containing the
    word), so a word that occurs in every document scores 0.
    """
    n_docs = len(docs)

    # Document frequency: the set makes a word count at most once per document.
    doc_freq = {}
    for text in docs:
        for term in set(text.split()):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    return {term: math.log(n_docs / freq) for term, freq in doc_freq.items()}

# Show the IDF weight of every word found in the corpus
print(compute_idf(docs))

## Functions to calculate tf-idf

TF-IDF stands for Term Frequency-Inverse Document Frequency and is used to assess the importance of a word in a document in relation to a collection of documents (corpus).


In [None]:
# Toy corpus: three short sentences that share most of their vocabulary
docs = [
    "this is a sample document",
    "this document is another example",
    "this example document is different",
]

import math

# Step 1: compute TF
def compute_tf(doc):
    """Return the term frequency of every word in ``doc``.

    The document is split on whitespace. Each distinct word maps to its
    number of occurrences divided by the total number of words in the
    document. A document without any words yields an empty dict.
    """
    tokens = doc.split()
    n_tokens = len(tokens)

    # Tally raw occurrences, keeping words in order of first appearance.
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    # Normalise each raw count by the document length.
    return {token: count / n_tokens for token, count in counts.items()}

# Step 2: compute IDF
def compute_idf(docs):
    """Return the inverse document frequency of every word in ``docs``.

    Each document is split on whitespace. A word's IDF is the natural
    logarithm of (number of documents / number of documents containing the
    word), so a word that occurs in every document scores 0.
    """
    n_docs = len(docs)

    # Document frequency: the set makes a word count at most once per document.
    doc_freq = {}
    for text in docs:
        for term in set(text.split()):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    return {term: math.log(n_docs / freq) for term, freq in doc_freq.items()}

# Step 3: compute TF-IDF
def compute_tfidf(tf, idf):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    ``tf`` maps the words of one document to their term frequency and
    ``idf`` maps words to their inverse document frequency. The result has
    one entry per word of ``tf``: the product of the two values. A word
    missing from ``idf`` is weighted with 0.
    """
    return {term: tf[term] * idf.get(term, 0) for term in tf}

# Term frequencies: one dict per document, in corpus order
tf_list = list(map(compute_tf, docs))

# IDF weights are a property of the whole corpus, so they are computed once
idf = compute_idf(docs)

# Weight every document's term frequencies with the corpus-wide IDF
tfidf_list = [compute_tfidf(doc_tf, idf) for doc_tf in tf_list]

# Report the scores per document (numbered from 1), rounded to four decimals
for i, tfidf in enumerate(tfidf_list):
    print(f"TF-IDF for dokument {i+1}:")
    for word, score in tfidf.items():
        print(f"  {word}: {score:.4f}")

import pandas as pd
# Tabulate the scores with pandas: each dict in tfidf_list becomes a row and
# each word a column; a word absent from a document is filled in as 0.
df = pd.DataFrame(tfidf_list)
df = df.fillna(0)
print('\n')
# Bare expression: shown as the cell output when run in a notebook
df